1. Loading libraries
To see which packages are used, please inspect the load_libraries and regression_metrics files.
source('load_libraries.R')
source("regression_metrics.R")2. Loading + organising data
Via gist paths, the datasets are loaded in data.table format.
house_train <- fread("https://gist.githubusercontent.com/geraldwal/c15f40a258a15f6417e70705d57e9a21/raw/800519c3f4d633b8985cf9fc4b60ebccc89494eb/house_price_train")
house_test <- fread("https://gist.githubusercontent.com/geraldwal/58f48ae5a44f7061ced0c3486e2f07aa/raw/5a77070d797ffa9598c0eb6f37facb6a2a20c78b/house_price_test")
head(house_train) id date price bedrooms bathrooms sqft_living sqft_lot
1: 9183703376 5/13/2014 225000 3 1.50 1250 7500
2: 464000600 8/27/2014 641250 3 2.50 2220 2550
3: 2224079050 7/18/2014 810000 4 3.50 3980 209523
4: 6163901283 1/30/2015 330000 4 1.50 1890 7540
5: 6392003810 5/23/2014 530000 4 1.75 1814 5000
6: 7974200948 5/20/2014 953007 4 3.50 3120 5086
floors waterfront view condition grade sqft_above sqft_basement
1: 1 0 0 3 7 1250 0
2: 3 0 2 3 10 2220 0
3: 2 0 2 3 9 3980 0
4: 1 0 0 4 7 1890 0
5: 1 0 0 4 7 944 870
6: 2 0 0 3 9 2480 640
yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
1: 1967 0 98030 47.3719 -122.215 1260 7563
2: 1990 0 98117 47.6963 -122.393 2200 5610
3: 2006 0 98024 47.5574 -121.890 2220 65775
4: 1967 0 98155 47.7534 -122.318 1890 8515
5: 1951 0 98115 47.6840 -122.281 1290 5000
6: 2008 0 98115 47.6762 -122.288 1880 5092
head(house_test) id date bedrooms bathrooms sqft_living sqft_lot floors
1: 6414100192 12/9/2014 3 2.25 2570 7242 2.0
2: 6054650070 10/7/2014 3 1.75 1370 9680 1.0
3: 16000397 12/5/2014 2 1.00 1200 9850 1.0
4: 2524049179 8/26/2014 3 2.75 3050 44867 1.0
5: 8562750320 11/10/2014 3 2.50 2320 3980 2.0
6: 7589200193 11/10/2014 3 1.00 1090 3000 1.5
waterfront view condition grade sqft_above sqft_basement yr_built
1: 0 0 3 7 2170 400 1951
2: 0 0 4 7 1370 0 1977
3: 0 0 4 7 1200 0 1921
4: 0 4 3 9 2330 720 1968
5: 0 0 3 8 2320 0 2003
6: 0 0 4 8 1090 0 1929
yr_renovated zipcode lat long sqft_living15 sqft_lot15
1: 1991 98125 47.7210 -122.319 1690 7639
2: 0 98074 47.6127 -122.045 1370 10208
3: 0 98002 47.3089 -122.210 1060 5095
4: 0 98040 47.5316 -122.233 4110 20336
5: 0 98027 47.5391 -122.070 2580 3980
6: 0 98117 47.6889 -122.375 1570 5080
In this chunk, a split column is added in order to easily split and stack the datasets when needed.
house_test$split <- "test"
house_test$price <- 0
house_train$split <- "train"
house_data <- rbind(house_train, house_test)
str(house_data)Classes 'data.table' and 'data.frame': 21597 obs. of 22 variables:
$ id :integer64 9183703376 464000600 2224079050 6163901283 6392003810 7974200948 2426059124 2115510300 ...
$ date : chr "5/13/2014" "8/27/2014" "7/18/2014" "1/30/2015" ...
$ price : num 225000 641250 810000 330000 530000 ...
$ bedrooms : int 3 3 4 4 4 4 4 3 4 3 ...
$ bathrooms : num 1.5 2.5 3.5 1.5 1.75 3.5 3.25 2.25 2.5 1.5 ...
$ sqft_living : int 1250 2220 3980 1890 1814 3120 4160 1440 2250 2540 ...
$ sqft_lot : int 7500 2550 209523 7540 5000 5086 47480 10500 6840 9520 ...
$ floors : num 1 3 2 1 1 2 2 1 2 1 ...
$ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
$ view : int 0 2 2 0 0 0 0 0 0 0 ...
$ condition : int 3 3 3 4 4 3 3 3 3 3 ...
$ grade : int 7 10 9 7 7 9 10 8 9 8 ...
$ sqft_above : int 1250 2220 3980 1890 944 2480 4160 1130 2250 1500 ...
$ sqft_basement: int 0 0 0 0 870 640 0 310 0 1040 ...
$ yr_built : int 1967 1990 2006 1967 1951 2008 1995 1983 1987 1959 ...
$ yr_renovated : int 0 0 0 0 0 0 0 0 0 0 ...
$ zipcode : int 98030 98117 98024 98155 98115 98115 98072 98023 98058 98115 ...
$ lat : num 47.4 47.7 47.6 47.8 47.7 ...
$ long : num -122 -122 -122 -122 -122 ...
$ sqft_living15: int 1260 2200 2220 1890 1290 1880 3400 1510 2480 1870 ...
$ sqft_lot15 : int 7563 5610 65775 8515 5000 5092 40428 8125 7386 6800 ...
$ split : chr "train" "train" "train" "train" ...
- attr(*, ".internal.selfref")=<externalptr>
# Copy to continue working on
hd <- house_dataOnly keep unique rows.
sum(duplicated(hd$id))[1] 177
hd <- hd[!duplicated(hd$id), ]
hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),]3. EDA
Analysis of the different variables, based on their individual distributions and relation according to the target variable price.
table(hd$floors)
1 1.5 2 2.5 3 3.5
10552 1888 8203 161 609 7
unique(house_data$bedrooms) [1] 3 4 5 2 6 1 7 8 9 10 33 11
unique(house_data$zipcode) [1] 98030 98117 98024 98155 98115 98072 98023 98058 98056 98045 98028
[12] 98118 98144 98074 98108 98008 98027 98038 98107 98133 98166 98065
[23] 98042 98040 98022 98006 98033 98052 98034 98092 98112 98198 98178
[34] 98106 98168 98053 98014 98077 98103 98001 98029 98019 98031 98075
[45] 98059 98136 98039 98011 98119 98122 98126 98177 98003 98116 98105
[56] 98005 98032 98146 98109 98199 98055 98070 98007 98002 98125 98148
[67] 98010 98004 98188 98102
plot(table(house_train$bedrooms))plot(table(house_train$yr_renovated))plot(table(house_train$sqft_basement))unique(house_data$long) [1] -122.215 -122.393 -121.890 -122.318 -122.281 -122.288 -122.115
[8] -122.391 -122.151 -122.306 -122.182 -121.707 -122.244 -122.104
[15] -122.266 -122.297 -122.174 -122.051 -122.295 -122.301 -122.396
[22] -122.105 -121.774 -122.230 -122.092 -122.041 -122.367 -122.350
[29] -122.339 -122.122 -121.879 -122.119 -122.022 -122.128 -122.316
[36] -122.213 -122.027 -122.114 -122.171 -122.103 -122.214 -122.087
[43] -122.191 -121.880 -122.173 -122.309 -122.247 -122.290 -122.358
[50] -121.966 -122.277 -122.210 -122.268 -122.074 -122.089 -122.190
[57] -122.047 -122.017 -122.362 -122.274 -122.395 -121.920 -122.052
[64] -122.223 -122.033 -122.167 -122.335 -122.161 -122.331 -122.152
[71] -122.365 -122.340 -122.276 -122.018 -122.338 -122.349 -121.983
[78] -122.222 -122.045 -121.994 -122.116 -122.291 -122.144 -122.176
[85] -122.184 -122.389 -122.257 -122.226 -122.181 -122.313 -122.100
[92] -122.206 -122.066 -122.400 -122.133 -122.278 -122.361 -122.299
[99] -122.351 -122.056 -122.264 -122.200 -122.371 -122.279 -122.384
[106] -122.307 -122.140 -122.235 -122.319 -122.368 -122.189 -122.369
[113] -122.029 -121.951 -122.390 -122.370 -122.308 -121.973 -122.168
[120] -122.156 -122.330 -122.324 -122.280 -122.229 -121.984 -122.317
[127] -122.285 -122.379 -122.377 -122.157 -122.153 -122.164 -122.386
[134] -122.347 -122.064 -122.071 -122.134 -122.023 -122.388 -121.999
[141] -122.356 -122.013 -122.038 -122.253 -121.976 -122.021 -122.232
[148] -122.397 -122.172 -122.188 -122.108 -122.242 -122.387 -122.314
[155] -122.170 -122.372 -122.149 -122.272 -122.376 -122.135 -122.273
[162] -122.097 -122.342 -121.878 -122.286 -122.150 -122.030 -122.322
[169] -121.709 -122.165 -122.270 -122.085 -122.402 -122.121 -122.058
[176] -122.403 -121.871 -122.246 -122.267 -122.289 -122.073 -122.055
[183] -122.040 -121.969 -121.970 -122.007 -121.764 -122.109 -122.217
[190] -122.378 -122.192 -122.185 -122.298 -122.065 -122.256 -122.444
[197] -122.132 -122.227 -122.259 -122.175 -122.025 -122.009 -122.399
[204] -122.409 -122.112 -122.294 -122.383 -122.243 -122.196 -122.169
[211] -122.123 -122.237 -122.154 -122.332 -122.392 -122.300 -122.357
[218] -122.024 -122.304 -122.346 -122.094 -122.233 -122.062 -122.020
[225] -122.303 -122.099 -122.197 -122.359 -122.036 -122.083 -121.912
[232] -122.284 -121.748 -122.186 -122.320 -122.117 -122.158 -122.178
[239] -122.032 -122.068 -122.265 -121.967 -122.031 -122.310 -122.141
[246] -122.225 -121.883 -122.204 -121.996 -122.130 -121.841 -122.208
[253] -122.124 -122.328 -122.269 -121.961 -121.902 -122.260 -121.842
[260] -122.315 -122.275 -122.106 -122.003 -122.327 -121.982 -122.353
[267] -122.312 -122.287 -122.162 -122.126 -122.180 -122.203 -122.147
[274] -122.216 -122.166 -122.302 -122.160 -122.050 -122.321 -122.283
[281] -122.352 -122.375 -122.187 -122.385 -122.404 -121.772 -122.245
[288] -122.271 -122.136 -121.965 -122.049 -122.363 -122.205 -122.070
[295] -122.221 -122.212 -121.909 -122.113 -122.037 -122.120 -122.311
[302] -122.219 -122.125 -122.046 -122.412 -122.231 -122.195 -122.252
[309] -121.770 -122.373 -122.148 -122.201 -122.381 -121.869 -121.864
[316] -122.241 -122.209 -121.874 -121.778 -122.374 -121.876 -122.129
[323] -121.948 -122.366 -122.015 -121.921 -121.776 -122.194 -121.746
[330] -121.862 -122.043 -122.127 -122.014 -122.218 -122.111 -122.042
[337] -122.091 -122.394 -122.179 -121.988 -122.292 -122.445 -121.998
[344] -122.293 -122.080 -122.146 -122.001 -122.360 -122.341 -121.893
[351] -122.000 -122.410 -122.005 -122.282 -121.783 -122.077 -121.905
[358] -122.081 -122.345 -121.980 -122.061 -122.019 -122.482 -121.990
[365] -121.989 -121.979 -122.421 -122.355 -121.769 -122.337 -122.054
[372] -121.845 -122.348 -121.971 -122.110 -121.954 -122.193 -122.069
[379] -122.159 -122.059 -122.101 -122.248 -122.234 -122.236 -122.202
[386] -122.382 -121.908 -122.155 -121.992 -121.745 -122.072 -122.380
[393] -122.258 -122.067 -122.329 -122.326 -122.093 -121.829 -122.090
[400] -121.889 -121.759 -122.228 -121.773 -122.211 -122.039 -122.224
[407] -122.075 -122.012 -122.163 -121.915 -122.118 -122.139 -122.010
[414] -122.325 -122.364 -122.088 -121.997 -122.240 -122.263 -122.078
[421] -122.095 -122.199 -122.405 -122.344 -122.238 -122.239 -122.082
[428] -121.993 -121.763 -121.718 -122.323 -122.107 -122.334 -122.354
[435] -121.860 -122.142 -122.446 -122.183 -122.086 -122.016 -121.901
[442] -121.963 -122.198 -121.977 -122.262 -121.910 -121.756 -122.343
[449] -121.904 -122.026 -121.926 -122.137 -122.333 -122.220 -122.336
[456] -122.406 -121.788 -121.873 -121.870 -121.964 -122.177 -121.899
[463] -121.972 -121.858 -121.716 -121.945 -122.143 -122.063 -122.034
[470] -121.881 -122.296 -122.261 -122.138 -121.887 -121.944 -122.305
[477] -121.959 -122.251 -121.744 -121.801 -122.464 -121.995 -122.028
[484] -122.048 -122.131 -122.145 -121.855 -121.723 -122.044 -121.698
[491] -122.413 -121.789 -121.898 -121.930 -121.758 -121.903 -122.255
[498] -122.249 -121.958 -122.011 -121.986 -122.102 -122.004 -121.859
[505] -122.098 -121.922 -121.754 -121.962 -122.060 -121.956 -122.057
[512] -121.975 -121.866 -122.035 -121.868 -122.416 -122.084 -121.888
[519] -122.079 -121.940 -121.786 -122.006 -121.838 -122.076 -122.443
[526] -121.947 -122.053 -122.207 -121.885 -121.894 -121.768 -121.767
[533] -121.867 -122.438 -121.852 -121.946 -121.877 -121.853 -121.985
[540] -121.771 -121.780 -122.254 -122.002 -121.955 -121.856 -122.514
[547] -121.733 -122.456 -121.850 -121.747 -121.861 -121.918 -121.987
[554] -121.826 -122.096 -121.978 -121.981 -121.891 -121.968 -121.743
[561] -121.405 -122.250 -122.415 -122.407 -121.777 -121.882 -122.465
[568] -121.875 -121.321 -122.411 -121.913 -122.462 -121.911 -122.408
[575] -121.907 -122.398 -121.799 -122.474 -121.960 -121.991 -121.974
[582] -122.008 -121.815 -121.886 -122.401 -121.782 -122.496 -121.917
[589] -121.939 -121.364 -121.730 -121.827 -121.914 -121.315 -122.463
[596] -121.929 -121.775 -121.403 -121.726 -122.451 -121.935 -121.787
[603] -121.892 -122.453 -122.467 -121.719 -122.499 -121.932 -121.949
[610] -122.470 -121.851 -122.447 -121.755 -121.872 -121.900 -121.734
[617] -121.714 -122.472 -121.863 -121.802 -121.847 -121.781 -121.725
[624] -121.822 -121.691 -121.937 -121.800 -121.897 -122.507 -121.711
[631] -121.752 -122.449 -122.439 -121.784 -121.779 -121.953 -121.821
[638] -122.504 -121.795 -121.952 -121.857 -122.450 -121.757 -121.925
[645] -121.749 -121.957 -121.837 -121.884 -122.475 -121.736 -121.713
[652] -121.835 -121.896 -122.502 -122.422 -122.469 -121.316 -122.414
[659] -122.486 -122.505 -121.810 -122.519 -122.497 -121.738 -121.854
[666] -121.906 -121.724 -121.950 -122.454 -122.461 -121.765 -121.735
[673] -121.931 -121.402 -122.458 -122.431 -121.742 -122.455 -122.457
[680] -121.325 -121.927 -122.506 -121.846 -121.895 -121.943 -121.676
[687] -121.701 -121.833 -121.766 -122.484 -121.646 -121.916 -122.432
[694] -121.840 -121.941 -122.420 -121.785 -121.933 -122.515 -122.479
[701] -122.430 -121.924 -122.425 -121.823 -121.721 -121.848 -122.433
[708] -122.440 -122.435 -121.750 -121.727 -121.762 -121.809 -121.834
[715] -121.359 -121.934 -122.459 -121.480 -122.503 -122.490 -121.319
[722] -121.803 -121.417 -121.739 -122.473 -121.352 -121.828 -121.942
[729] -121.731 -121.760 -122.512 -121.708 -121.865 -121.761 -122.448
[736] -121.936 -121.820 -122.509 -121.473 -122.441 -122.511 -121.818
[743] -121.819 -122.460 -121.804 -122.452 -121.831 -121.737 -121.792
[750] -121.849 -121.797
par(mfrow=c(1, 4)) # divide graph area in 4 columns
# Box plot for 'price'
boxplot(hdTrain$price, main="Price", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$price)$out))
# Box plot for 'lat'
boxplot(hdTrain$lat, main="lat", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$lat)$out))
# Box plot for 'bedrooms'
boxplot(hdTrain$bedrooms, main="bedrooms", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$bedrooms)$out))
# Box plot for 'bathrooms'
boxplot(hdTrain$bathrooms, main="bathrooms", sub=paste("Outlier rows: ", boxplot.stats(hdTrain$bathrooms)$out)) Advanced graphs
Showing the evolution of amount of houses over the years
hd %>%
group_by(yr_built) %>%
summarise(n = n()) %>%
ggplot(aes(x = yr_built, y = n)) +
geom_line(color = 'black') +
geom_point(color = 'blue', size = 0.5) +
geom_smooth(method="lm", color = 'green', size = 0.5) +
theme_linedraw() +
theme(plot.title = element_text(hjust = 0,color = 'black')) +
labs(x = 'year', y = 'count',
title = "House sales in 1900 - 2015") +
scale_x_continuous(breaks=seq(1900, 2015, 10))Interactive graph summarizing several variables related to price
plot_ly(y=~price,x=~bedrooms,data = hdTrain,
hoverinfo = "text",text= ~paste("Price: ",price,"\n","Bathrooms:",bathrooms,"\n","Grade: ",
grade ,"\n","Condition: ",condition,"\n","Year Built:",yr_built))%>%
add_markers()%>%
layout(title="Price vs Bedrooms")Correlation plot
library(corrplot,quietly = T)
set.seed(990)
Nindex<-sapply(hd,is.numeric)
Ndata<-hd[,..Nindex]
corrplot(cor(Ndata))4. Data preparation
a. Factorizing
Floors, bathrooms, waterfront and view consist of clear categories. That is why their numeric datatype is changed in to the factorial one.
b. Binarizing
sqft_basement and yr_renovated showed a clear distribution in the EDA, with most of the values being 0. With an ifelse condition all other variables are therefore changed into 1.
# Factorization
hd$floors <- sapply(hd$floors, as.factor)
hd$bathrooms <- sapply(hd$bathrooms, as.factor)
hd$waterfront <- sapply(hd$waterfront, as.factor)
hd$view <- sapply(hd$view, as.factor)
# Binarizing sqft_basement and yr_renovated
hd[,"sqft_basement"] <- ifelse(hd[,"sqft_basement"] > 0,1,0)
hd[,"yr_renovated"] <- ifelse(hd[,"yr_renovated"] > 0,1,0)
# Split data table again
hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),] # do not touch anymore5. Modelling
Linear regression, random forest and gradient boosting are the models chosen to predict the house prices. There will be three iterations, with in between a Feature Engineering step, aiming to increase the score. In the last modeling phase, the parameters of the two latter models are also tuned.
Splitting function
set.seed(1996)
spliting <- sample(1:3, size = nrow(hdTrain), prob = c(0.7, 0.15, 0.15), replace = TRUE)a. Baseline
formula = price~.
fitControl <- trainControl(method="cv",number = 10)
# Train, validation and testset to score and evaluate models
train0 <- hdTrain[spliting == 1, 3:21]
val0 <- hdTrain[spliting == 2, 3:21]
test0 <- hdTrain[spliting == 3, 3:21]
#Baseline lm
lm_0<-train(formula,data = train0, method = 'lm', trControl = fitControl, metric = 'MAE')
summary(lm_0)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-2597285 -91689 -10263 74977 2597285
Coefficients: (3 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5672080.02461 3673239.82609 1.544 0.122575
bedrooms -20261.23608 2317.01421 -8.745 < 0.0000000000000002 ***
bathrooms2.5 -10275.91912 8491.23600 -1.210 0.226234
bathrooms3.5 93255.15211 13176.79856 7.077 0.000000000001553 ***
bathrooms1.75 -13491.43569 8147.43885 -1.656 0.097766 .
bathrooms3.25 122454.78649 13631.80362 8.983 < 0.0000000000000002 ***
bathrooms2.25 -2413.03418 9043.53821 -0.267 0.789609
bathrooms3 25036.78812 11983.55101 2.089 0.036705 *
bathrooms2.75 -12857.94320 10679.17843 -1.204 0.228606
bathrooms1 15547.93530 8148.58804 1.908 0.056408 .
bathrooms2 3186.93121 8815.73249 0.362 0.717728
bathrooms6 544907.85816 108706.00957 5.013 0.000000544495624 ***
bathrooms3.75 228328.16459 23085.22153 9.891 < 0.0000000000000002 ***
bathrooms4.5 112213.97108 28787.63076 3.898 0.000097524547988 ***
bathrooms4 228051.59818 23433.40706 9.732 < 0.0000000000000002 ***
bathrooms0.75 50206.97371 29053.68337 1.728 0.084000 .
bathrooms4.75 597666.87828 46943.33273 12.732 < 0.0000000000000002 ***
bathrooms4.25 390613.35912 33039.04635 11.823 < 0.0000000000000002 ***
bathrooms1.25 6847.75524 83176.83066 0.082 0.934388
bathrooms5.75 -227483.81187 132360.41085 -1.719 0.085701 .
bathrooms8 2263048.69293 137157.70092 16.500 < 0.0000000000000002 ***
bathrooms5 403308.10304 57236.24946 7.046 0.000000000001938 ***
bathrooms7.5 NA NA NA NA
bathrooms5.5 1255545.45895 84762.17486 14.813 < 0.0000000000000002 ***
bathrooms5.25 712165.06110 71257.97046 9.994 < 0.0000000000000002 ***
bathrooms7.75 4516039.97034 188056.66270 24.014 < 0.0000000000000002 ***
bathrooms6.5 NA NA NA NA
bathrooms6.25 1115926.00483 133055.86949 8.387 < 0.0000000000000002 ***
bathrooms0.5 -28780.05960 131019.20595 -0.220 0.826137
bathrooms6.75 NA NA NA NA
sqft_living 101.88086 7.93584 12.838 < 0.0000000000000002 ***
sqft_lot 0.14673 0.05936 2.472 0.013451 *
floors3 39735.26520 11969.00783 3.320 0.000903 ***
floors2 13370.30343 5611.58725 2.383 0.017205 *
floors1.5 12502.68765 6785.11368 1.843 0.065403 .
floors2.5 134941.29037 20553.06677 6.566 0.000000000054008 ***
floors3.5 560374.86994 131469.47077 4.262 0.000020380288885 ***
waterfront1 490125.67867 26134.82596 18.754 < 0.0000000000000002 ***
view2 62227.70004 8460.65740 7.355 0.000000000000204 ***
view1 123239.22798 14942.50231 8.248 < 0.0000000000000002 ***
view3 163590.72319 11501.91721 14.223 < 0.0000000000000002 ***
view4 285326.68972 19449.24522 14.670 < 0.0000000000000002 ***
condition 34662.63168 2940.93590 11.786 < 0.0000000000000002 ***
grade 104056.48514 2701.02748 38.525 < 0.0000000000000002 ***
sqft_above 25.09191 8.73222 2.873 0.004067 **
sqft_basement 11887.41879 6695.66593 1.775 0.075859 .
yr_built -2054.81240 98.78115 -20.802 < 0.0000000000000002 ***
yr_renovated 63234.78132 9007.77260 7.020 0.000000000002339 ***
zipcode -535.94144 41.05424 -13.054 < 0.0000000000000002 ***
lat 583213.88079 13288.30428 43.889 < 0.0000000000000002 ***
long -184185.24693 16447.59371 -11.198 < 0.0000000000000002 ***
sqft_living15 44.21389 4.34594 10.174 < 0.0000000000000002 ***
sqft_lot15 -0.41753 0.08736 -4.779 0.000001781370153 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 184900 on 11920 degrees of freedom
Multiple R-squared: 0.7436, Adjusted R-squared: 0.7426
F-statistic: 705.7 on 49 and 11920 DF, p-value: < 0.00000000000000022
pred_lm_0<-predict(lm_0, newdata = test0)
mape_lm_0<-mape(real=test0$price, predicted = pred_lm_0)
mape_lm_0[1] 0.2438174
#Baseline rf
rf_0<-randomForest(formula, data = train0, ntree = 100)
summary(rf_0) Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 11970 -none- numeric
mse 100 -none- numeric
rsq 100 -none- numeric
oob.times 11970 -none- numeric
importance 18 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 11 -none- list
coefs 0 -none- NULL
y 11970 -none- numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
varImpPlot(rf_0,type=2)pred_rf_0<-predict(rf_0, newdata = test0)
mape_rf_0<-mape(real=test0$price, predicted = pred_rf_0)
mape_rf_0[1] 0.1309606
#Plotting the actual and predicted
ggplot(test0,aes(x=price,y=pred_rf_0))+geom_point()+geom_abline(color="blue")#Baseline GBM
gb_0 <- gbm(formula,data = train0,n.trees = 100,cv.folds = 5)Distribution not specified, assuming gaussian ...
summary(gb_0) var rel.inf
sqft_living sqft_living 40.1830656
grade grade 27.8547928
lat lat 16.0301247
view view 6.5117737
bathrooms bathrooms 5.2482973
waterfront waterfront 2.7416587
sqft_living15 sqft_living15 0.5947020
yr_built yr_built 0.3121457
yr_renovated yr_renovated 0.2769282
condition condition 0.1252483
long long 0.1212630
bedrooms bedrooms 0.0000000
sqft_lot sqft_lot 0.0000000
floors floors 0.0000000
sqft_above sqft_above 0.0000000
sqft_basement sqft_basement 0.0000000
zipcode zipcode 0.0000000
sqft_lot15 sqft_lot15 0.0000000
pred_gb_0<-predict(gb_0, newdata = test0)Using 100 trees...
mape_gb_0<-mape(real=test0$price, predicted = pred_gb_0)
mape_gb_0[1] 0.206808
Feature Engineering 1: extract date information and add it to the datatable
#date transformations and variable extractions
hd <- cbind(hd, hd$date) #copy of date column
hd$date <- mdy(hd$date)
hd$date <-as.numeric(as.Date(hd$date, origin = "1900-01-01"))
colnames(hd)[colnames(hd)=="V2"] <- "copy_date"
hd$copy_date <- as.Date(hd$copy_date, format = '%m/%d/%Y')
hd$month <- month(hd$copy_date)
hd$day <- day(hd$copy_date)
hd$year <- year(hd$copy_date)
unique(hd$year) # check years[1] 2014 2015
str(hd)Classes 'data.table' and 'data.frame': 21420 obs. of 26 variables:
$ id :integer64 9183703376 464000600 2224079050 6163901283 6392003810 7974200948 2426059124 2115510300 ...
$ date : num 16203 16309 16269 16465 16213 ...
$ price : num 225000 641250 810000 330000 530000 ...
$ bedrooms : int 3 3 4 4 4 4 4 3 4 3 ...
$ bathrooms : Factor w/ 29 levels "1.5","2.5","3.5",..: 1 2 3 1 4 3 5 6 2 1 ...
$ sqft_living : int 1250 2220 3980 1890 1814 3120 4160 1440 2250 2540 ...
$ sqft_lot : int 7500 2550 209523 7540 5000 5086 47480 10500 6840 9520 ...
$ floors : Factor w/ 6 levels "1","3","2","1.5",..: 1 2 3 1 1 3 3 1 3 1 ...
$ waterfront : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
$ view : Factor w/ 5 levels "0","2","1","3",..: 1 2 2 1 1 1 1 1 1 1 ...
$ condition : int 3 3 3 4 4 3 3 3 3 3 ...
$ grade : int 7 10 9 7 7 9 10 8 9 8 ...
$ sqft_above : int 1250 2220 3980 1890 944 2480 4160 1130 2250 1500 ...
$ sqft_basement: num 0 0 0 0 1 1 0 1 0 1 ...
$ yr_built : int 1967 1990 2006 1967 1951 2008 1995 1983 1987 1959 ...
$ yr_renovated : num 0 0 0 0 0 0 0 0 0 0 ...
$ zipcode : int 98030 98117 98024 98155 98115 98115 98072 98023 98058 98115 ...
$ lat : num 47.4 47.7 47.6 47.8 47.7 ...
$ long : num -122 -122 -122 -122 -122 ...
$ sqft_living15: int 1260 2200 2220 1890 1290 1880 3400 1510 2480 1870 ...
$ sqft_lot15 : int 7563 5610 65775 8515 5000 5092 40428 8125 7386 6800 ...
$ split : chr "train" "train" "train" "train" ...
$ copy_date : Date, format: "2014-05-13" "2014-08-27" ...
$ month : num 5 8 7 1 5 5 12 10 11 7 ...
$ day : int 13 27 18 30 23 20 16 16 26 21 ...
$ year : num 2014 2014 2014 2015 2014 ...
- attr(*, ".internal.selfref")=<externalptr>
hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),] # do not touch anymoreb. Modeling phase 2
# Train, validation and testset to score and evaluate models
train1 <- hdTrain[spliting == 1, c(3:21, 24:26)]
val1 <- hdTrain[spliting == 2, c(3:21, 24:26)]
test1 <- hdTrain[spliting == 3, c(3:21, 24:26)]
#LM with FE1
lm_1<-train(formula,data = train1, method = 'lm', trControl = fitControl, metric = 'MAE')
summary(lm_1)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-2590561 -91217 -9452 74462 2590561
Coefficients: (3 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -73926807.59095 12345532.56763 -5.988 0.000000002183665464
bedrooms -20293.68251 2310.59099 -8.783 < 0.0000000000000002
bathrooms2.5 -10252.83910 8467.94690 -1.211 0.226003
bathrooms3.5 92693.56029 13139.59505 7.055 0.000000000001827827
bathrooms1.75 -13726.84965 8125.25931 -1.689 0.091168
bathrooms3.25 123511.48845 13593.72820 9.086 < 0.0000000000000002
bathrooms2.25 -3342.09419 9018.52522 -0.371 0.710956
bathrooms3 24938.58270 11951.19968 2.087 0.036936
bathrooms2.75 -12247.95942 10649.37764 -1.150 0.250122
bathrooms1 15150.94520 8125.93821 1.865 0.062274
bathrooms2 3194.25405 8791.17712 0.363 0.716352
bathrooms6 532937.90633 108406.79214 4.916 0.000000894645045081
bathrooms3.75 228155.90041 23020.09153 9.911 < 0.0000000000000002
bathrooms4.5 110721.66842 28709.63832 3.857 0.000116
bathrooms4 223536.92199 23374.20011 9.563 < 0.0000000000000002
bathrooms0.75 50869.20248 28973.50922 1.756 0.079163
bathrooms4.75 599123.83009 46817.18496 12.797 < 0.0000000000000002
bathrooms4.25 388695.07208 32946.32182 11.798 < 0.0000000000000002
bathrooms1.25 6717.72519 82958.91020 0.081 0.935462
bathrooms5.75 -228845.63104 132021.49247 -1.733 0.083051
bathrooms8 2271127.19348 136783.78612 16.604 < 0.0000000000000002
bathrooms5 409644.55368 57086.56495 7.176 0.000000000000761208
bathrooms7.5 NA NA NA NA
bathrooms5.5 1262005.49323 84527.07936 14.930 < 0.0000000000000002
bathrooms5.25 722112.89345 71066.19646 10.161 < 0.0000000000000002
bathrooms7.75 4522574.12325 187524.59229 24.117 < 0.0000000000000002
bathrooms6.5 NA NA NA NA
bathrooms6.25 1126217.14029 132712.59122 8.486 < 0.0000000000000002
bathrooms0.5 -33152.48902 130657.29507 -0.254 0.799704
bathrooms6.75 NA NA NA NA
sqft_living 101.99207 7.91331 12.889 < 0.0000000000000002
sqft_lot 0.13805 0.05920 2.332 0.019728
floors3 40270.76650 11937.22172 3.374 0.000744
floors2 13896.05520 5596.37502 2.483 0.013040
floors1.5 12854.88457 6766.29544 1.900 0.057478
floors2.5 136233.94943 20496.68231 6.647 0.000000000031294489
floors3.5 569758.27569 131108.18989 4.346 0.000013997559685484
waterfront1 486717.04852 26064.57096 18.674 < 0.0000000000000002
view2 61637.14859 8437.29441 7.305 0.000000000000294234
view1 122091.41205 14901.82220 8.193 0.000000000000000281
view3 162973.27565 11470.41942 14.208 < 0.0000000000000002
view4 285625.04663 19397.49464 14.725 < 0.0000000000000002
condition 36242.42727 2938.66774 12.333 < 0.0000000000000002
grade 104298.35361 2694.51130 38.708 < 0.0000000000000002
sqft_above 24.78066 8.70840 2.846 0.004440
sqft_basement 11709.04984 6676.75481 1.754 0.079507
yr_built -2049.30921 98.50793 -20.803 < 0.0000000000000002
yr_renovated 65931.97411 8990.09241 7.334 0.000000000000238116
zipcode -535.05506 40.94193 -13.069 < 0.0000000000000002
lat 585386.41342 13253.34701 44.169 < 0.0000000000000002
long -183812.68398 16407.15088 -11.203 < 0.0000000000000002
sqft_living15 44.72548 4.33422 10.319 < 0.0000000000000002
sqft_lot15 -0.40804 0.08713 -4.683 0.000002853429975583
month 1898.46611 875.22038 2.169 0.030093
day 17.30619 196.16046 0.088 0.929700
year 39428.97603 5841.33570 6.750 0.000000000015470838
(Intercept) ***
bedrooms ***
bathrooms2.5
bathrooms3.5 ***
bathrooms1.75 .
bathrooms3.25 ***
bathrooms2.25
bathrooms3 *
bathrooms2.75
bathrooms1 .
bathrooms2
bathrooms6 ***
bathrooms3.75 ***
bathrooms4.5 ***
bathrooms4 ***
bathrooms0.75 .
bathrooms4.75 ***
bathrooms4.25 ***
bathrooms1.25
bathrooms5.75 .
bathrooms8 ***
bathrooms5 ***
bathrooms7.5
bathrooms5.5 ***
bathrooms5.25 ***
bathrooms7.75 ***
bathrooms6.5
bathrooms6.25 ***
bathrooms0.5
bathrooms6.75
sqft_living ***
sqft_lot *
floors3 ***
floors2 *
floors1.5 .
floors2.5 ***
floors3.5 ***
waterfront1 ***
view2 ***
view1 ***
view3 ***
view4 ***
condition ***
grade ***
sqft_above **
sqft_basement .
yr_built ***
yr_renovated ***
zipcode ***
lat ***
long ***
sqft_living15 ***
sqft_lot15 ***
month *
day
year ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 184400 on 11917 degrees of freedom
Multiple R-squared: 0.7452, Adjusted R-squared: 0.7441
F-statistic: 670.1 on 52 and 11917 DF, p-value: < 0.00000000000000022
pred_lm_1<-predict(lm_1, newdata = test1)
mape_lm_1<-mape(real=test1$price, predicted = pred_lm_1)
mape_lm_1[1] 0.2441572
#RF with FE1
rf_1<-randomForest(formula, data = train1, ntree = 100)
summary(rf_1) Length Class Mode
call 4 -none- call
type 1 -none- character
predicted 11970 -none- numeric
mse 100 -none- numeric
rsq 100 -none- numeric
oob.times 11970 -none- numeric
importance 21 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 11 -none- list
coefs 0 -none- NULL
y 11970 -none- numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
varImpPlot(rf_1,type=2)pred_rf_1<-predict(rf_1, newdata = test1)
mape_rf_1<-mape(real=test1$price, predicted = pred_rf_1)
mape_rf_1[1] 0.1320853
#Plotting the actual and predicted
ggplot(test1,aes(x=price,y=pred_rf_1))+geom_point()+geom_abline(color="blue")#GBM with FE 1
gb_1 <- gbm(formula,data = train1,n.trees = 100,cv.folds = 5)Distribution not specified, assuming gaussian ...
summary(gb_1) var rel.inf
sqft_living sqft_living 39.1879971
grade grade 27.6046698
lat lat 15.2434675
view view 7.2956360
bathrooms bathrooms 5.4210370
waterfront waterfront 2.8424284
sqft_above sqft_above 1.2279090
yr_built yr_built 0.3977170
sqft_living15 sqft_living15 0.3235966
yr_renovated yr_renovated 0.2050863
long long 0.1299746
condition condition 0.1204806
bedrooms bedrooms 0.0000000
sqft_lot sqft_lot 0.0000000
floors floors 0.0000000
sqft_basement sqft_basement 0.0000000
zipcode zipcode 0.0000000
sqft_lot15 sqft_lot15 0.0000000
month month 0.0000000
day day 0.0000000
year year 0.0000000
pred_gb_1<-predict(gb_1, newdata = test1)Using 100 trees...
mape_gb_1<-mape(real=test1$price, predicted = pred_gb_1)
mape_gb_1[1] 0.2088095
Feature Engineering 2: delete as seen in the EDA, the outlier of 33 bedrooms with a relatively low price
#Remove outlier
plot(table(train1$bedrooms))train2 <- subset(train1, bedrooms < 12)
hdTrain <- hd[which(hd$split == "train"),]
hdTest <- hd[which(hd$split == "test"),] # do not touch anymorec. Modeling phase 3
Experimenting with parameter tuning for RF and GBM
#LM with FE2
lm_2<-train(formula,data = train2, method = 'lm', trControl = fitControl, metric = 'MAE')
summary(lm_2)
Call:
lm(formula = .outcome ~ ., data = dat)
Residuals:
Min 1Q Median 3Q Max
-2588982 -91049 -9501 74416 2588982
Coefficients: (3 not defined because of singularities)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -74092703.92127 12336535.48950 -6.006 0.000000001957517727
bedrooms -24289.78034 2488.76201 -9.760 < 0.0000000000000002
bathrooms2.5 -9166.69310 8465.50049 -1.083 0.27891
bathrooms3.5 94438.38489 13136.21857 7.189 0.000000000000690856
bathrooms1.75 -13910.04052 8119.40985 -1.713 0.08670
bathrooms3.25 124918.49031 13587.69226 9.194 < 0.0000000000000002
bathrooms2.25 -2510.93576 9013.97972 -0.279 0.78059
bathrooms3 27141.60274 11953.40705 2.271 0.02319
bathrooms2.75 -10196.26251 10652.24734 -0.957 0.33849
bathrooms1 13806.97645 8125.98471 1.699 0.08932
bathrooms2 3683.11015 8785.46245 0.419 0.67506
bathrooms6 531362.05408 108327.87840 4.905 0.000000945999008011
bathrooms3.75 230066.25119 23007.48895 10.000 < 0.0000000000000002
bathrooms4.5 113787.00639 28697.42361 3.965 0.000073806283397607
bathrooms4 226333.23001 23366.09519 9.686 < 0.0000000000000002
bathrooms0.75 47377.18583 28963.63056 1.636 0.10192
bathrooms4.75 600890.49191 46784.63988 12.844 < 0.0000000000000002
bathrooms4.25 389574.79487 32922.78575 11.833 < 0.0000000000000002
bathrooms1.25 4778.78898 82899.27235 0.058 0.95403
bathrooms5.75 -224831.35160 131927.93466 -1.704 0.08837
bathrooms8 2262020.76940 136699.82602 16.547 < 0.0000000000000002
bathrooms5 413504.33237 57051.73914 7.248 0.000000000000449472
bathrooms7.5 NA NA NA NA
bathrooms5.5 1259575.11992 84466.95516 14.912 < 0.0000000000000002
bathrooms5.25 726190.24685 71020.38355 10.225 < 0.0000000000000002
bathrooms7.75 4519602.87734 187388.28697 24.119 < 0.0000000000000002
bathrooms6.5 NA NA NA NA
bathrooms6.25 1121678.93942 132619.42210 8.458 < 0.0000000000000002
bathrooms0.5 -36744.20452 130564.10744 -0.281 0.77839
bathrooms6.75 NA NA NA NA
sqft_living 105.00381 7.93843 13.227 < 0.0000000000000002
sqft_lot 0.13422 0.05917 2.269 0.02331
floors3 39335.84351 11930.44366 3.297 0.00098
floors2 13521.47760 5592.94708 2.418 0.01564
floors1.5 13927.96315 6765.93134 2.059 0.03956
floors2.5 136549.57184 20481.77624 6.667 0.000000000027282846
floors3.5 571633.35175 131012.72688 4.363 0.000012925838185840
waterfront1 485281.35436 26047.58678 18.631 < 0.0000000000000002
view2 61003.21939 8432.39213 7.234 0.000000000000496278
view1 121525.90246 14891.46968 8.161 0.000000000000000366
view3 162127.36096 11463.69079 14.143 < 0.0000000000000002
view4 284617.89446 19384.67750 14.683 < 0.0000000000000002
condition 36062.66535 2936.80909 12.280 < 0.0000000000000002
grade 103698.25759 2696.14578 38.462 < 0.0000000000000002
sqft_above 24.16392 8.70320 2.776 0.00550
sqft_basement 11175.43353 6673.00944 1.675 0.09402
yr_built -2056.24844 98.44888 -20.886 < 0.0000000000000002
yr_renovated 65506.76965 8984.04056 7.291 0.000000000000326009
zipcode -537.89825 40.91723 -13.146 < 0.0000000000000002
lat 584237.10727 13246.31825 44.106 < 0.0000000000000002
long -184081.47388 16395.23277 -11.228 < 0.0000000000000002
sqft_living15 44.71400 4.33104 10.324 < 0.0000000000000002
sqft_lot15 -0.41375 0.08707 -4.752 0.000002039470433147
month 1924.09057 874.59855 2.200 0.02783
day 5.99149 196.03419 0.031 0.97562
year 39674.10655 5837.32832 6.797 0.000000000011220734
(Intercept) ***
bedrooms ***
bathrooms2.5
bathrooms3.5 ***
bathrooms1.75 .
bathrooms3.25 ***
bathrooms2.25
bathrooms3 *
bathrooms2.75
bathrooms1 .
bathrooms2
bathrooms6 ***
bathrooms3.75 ***
bathrooms4.5 ***
bathrooms4 ***
bathrooms0.75
bathrooms4.75 ***
bathrooms4.25 ***
bathrooms1.25
bathrooms5.75 .
bathrooms8 ***
bathrooms5 ***
bathrooms7.5
bathrooms5.5 ***
bathrooms5.25 ***
bathrooms7.75 ***
bathrooms6.5
bathrooms6.25 ***
bathrooms0.5
bathrooms6.75
sqft_living ***
sqft_lot *
floors3 ***
floors2 *
floors1.5 *
floors2.5 ***
floors3.5 ***
waterfront1 ***
view2 ***
view1 ***
view3 ***
view4 ***
condition ***
grade ***
sqft_above **
sqft_basement .
yr_built ***
yr_renovated ***
zipcode ***
lat ***
long ***
sqft_living15 ***
sqft_lot15 ***
month *
day
year ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 184300 on 11916 degrees of freedom
Multiple R-squared: 0.7456, Adjusted R-squared: 0.7444
F-statistic: 671.5 on 52 and 11916 DF, p-value: < 0.00000000000000022
pred_lm_2<-predict(lm_2, newdata = test1)
mape_lm_2<-mape(real=test1$price, predicted = pred_lm_2)
mape_lm_2[1] 0.2441382
#RF with FE2 and parameter tuning
rf_2<-randomForest(formula, data = train2, ntree = 200, mtry = 4, nodesize = 7)
summary(rf_2) Length Class Mode
call 6 -none- call
type 1 -none- character
predicted 11969 -none- numeric
mse 200 -none- numeric
rsq 200 -none- numeric
oob.times 11969 -none- numeric
importance 21 -none- numeric
importanceSD 0 -none- NULL
localImportance 0 -none- NULL
proximity 0 -none- NULL
ntree 1 -none- numeric
mtry 1 -none- numeric
forest 11 -none- list
coefs 0 -none- NULL
y 11969 -none- numeric
test 0 -none- NULL
inbag 0 -none- NULL
terms 3 terms call
varImpPlot(rf_2,type=2)pred_rf_2<-predict(rf_2, newdata = test1)
mape_rf_2<-mape(real=test1$price, predicted = pred_rf_2)
mape_rf_2[1] 0.1384995
#Plotting the actual and predicted
ggplot(test1,aes(x=price,y=pred_rf_2))+geom_point()+geom_abline(color="blue")#GBM with FE2 and parameter tuning
gb_2 <- gbm(formula,data = train2,n.trees = 200, interaction.depth = 2, shrinkage = 0.08, cv.folds = 5)Distribution not specified, assuming gaussian ...
summary(gb_2) var rel.inf
sqft_living sqft_living 30.65771949
grade grade 25.61927821
lat lat 14.16426632
bathrooms bathrooms 8.40253201
view view 6.14925684
long long 4.91186932
sqft_living15 sqft_living15 2.89843267
waterfront waterfront 2.12003468
yr_built yr_built 1.54577963
zipcode zipcode 1.26608828
sqft_above sqft_above 1.18598924
condition condition 0.37171398
yr_renovated yr_renovated 0.21827926
sqft_lot sqft_lot 0.12753421
sqft_lot15 sqft_lot15 0.10923666
month month 0.09220563
year year 0.07566302
bedrooms bedrooms 0.04141828
day day 0.02255447
floors floors 0.02014782
sqft_basement sqft_basement 0.00000000
pred_gb_2<-predict(gb_2, newdata = test1)Using 196 trees...
mape_gb_2<-mape(real=test1$price, predicted = pred_gb_2)
mape_gb_2[1] 0.169993
Parameter tuning decreased the score of the gradient boosting model drastically, but the random forest score became worse.
Extract best score
table <- c(mape_lm_0,mape_lm_1,mape_lm_2,
mape_rf_0, mape_rf_1, mape_rf_2,
mape_gb_0, mape_gb_1, mape_gb_2)
best_score <- min(table)MAPE rf_0: 0.1309606
The random forest MAPE of our baseline model is the best score
6. Score Testset
With the randomforest baseline model, the test set is scored and the prediction file saved.
final_test<-predict(rf_0, newdata = hdTest)
hdTest$price <- final_test
Gerald_Walravens_Project <- hdTest[, c(1, 3)]
write.csv(Gerald_Walravens_Project, "Gerald_Walravens_Project.csv")